## This code and analyses were produced Jason Anastasopoulos (j.andronici@gmail.com)
## and Anthony Bertelli for the paper "Understanding Delegation Through Machine Learning: A Methods and Application to the EU"

## Here we train a series of gradient boosted tree classifiers on the hand coded delegation and constraint data,
## apply to classifier to the provisions, and calculate delegation and constraint ratios 
## Each of the trained classifiers are saved in a .RData file


library(pacman)


# This loads and installs the packages you need at once
pacman::p_load(tm,SnowballC,foreign,plyr,twitteR,slam,foreign,wordcloud,LiblineaR,e1071, topicmodels,readr,
               monkeylearn, EBglmnet, bayesreg, ggplot2,randomForest,
               glmnet, monomvn, caret, rpart, xgboost, boot,dplyr, ranger,
               xgboost,quanteda)


##############################################################################################################
##############################################################################################################
##############################################################################################################
######################### Training ###########################################################################
##############################################################################################################
##############################################################################################################
##############################################################################################################



data = "~/Training Data/Training_Data_MS.csv"

textdata<-read.csv(data)

provision.text = textdata$Text

# Let's create two categories of constraints
# Constraints to members states

#[10] "MS.Time.Limit"                    "MS.Reporting.Requirements"        "MS.Consultation.Requirements"    
#[13] "MS.Appeals.Procedures"            "MS.Executive.Action.Required"     "MS.Legislative.Action.Required"  
#[16] "MS.Spending.Limits"               "MS.Executive.Action.Possible"     "MS.Exemptions" 

# Need to train a classifier for each of these categories of constraints and then we need to report
# the statistics for each of these

constraints.member<-textdata[,9:20]
delegation.member<-textdata$Lable


data.constraints.MS<-data.frame(provision.text,constraints.member)
data.delegation.MS<-data.frame(provision.text,delegation.member)

# Use regular expressions to clean up some elements of the documents
cleandocs<-c()


# Preprocess the text using quanteda

# Now we have to put the training data and classification data into one matrix

provisiontext<-sapply(textdata$Text,as.character)

provisiontext = corpus(provisiontext) # Create a corpus object

token.dirty  = tokens(provisiontext,ngrams = 1:2)
token.clean = tokens_select(token.dirty, 
                            c("/","@", "\\|","#","http","https" ,".com","$", " g "),
                            selection ="remove")
                            
dtm.new = dfm(token.clean, remove = stopwords("english"), 
              remove_punct = TRUE,stem = TRUE)

dtm.new  = dfm_trim(dtm.new,sparsity=.99)

data.constraints.MS.new<-data.constraints.MS
data.delegation.MS.new<-data.delegation.MS



############################################################################################################
############################################################################################################
############################################################################################################
############################################################################################################
##################################    Classifier Training Performance Constraint ###########################
############################################################################################################
############################################################################################################
############################################################################################################
############################################################################################################
############################################################################################################
############################################################################################################
############################################################################################################

# We have to train 12 different classifiers for constraint, safe the classifiers as objects, report performance of the
# classifiers and word importance plots 

#"MS.Rulemaking.Requirements"      
#[10] "MS.Time.Limit"                    "MS.Reporting.Requirements"        "MS.Consultation.Requirements"    
#[13] "MS.Appeals.Procedures"            "MS.Executive.Action.Required"     "MS.Legislative.Action.Required"  
#[16] "MS.Spending.Limits"               "MS.Executive.Action.Possible"     "MS.Exemptions"                   
#[19] "MS.Public.Hearings"               "MS.Legislative.Action.Possible"


# Constraint names
#constraint.names.upper = names(data.constraints.MS.new[,2:13])
#constraint.names.lower = tolower(names(data.constraints.MS.new[,2:13]))
#constraint.names.mat = data.frame(rep(0,dim(data.constraints.MS.new)[1]))

#[1] "ms.rulemaking.requirements"     "ms.time.limit"                  "ms.reporting.requirements"     
#[4] "ms.consultation.requirements"   "ms.appeals.procedures"          "ms.executive.action.required"  
#[7] "ms.legislative.action.required" "ms.spending.limits"             "ms.executive.action.possible"  
#[10] "ms.exemptions"                  "ms.public.hearings"             "ms.legislative.action.possible"

# List of the constraints, delegation ratio is going to have to have a denominator of 6
ms.rulemaking.requirements<- data.constraints.MS.new$MS.Rulemaking.Requirements
ms.rulemaking.requirements<- ifelse(as.numeric(ms.rulemaking.requirements) >= 2, 1,0)

ms.time.limit<-data.constraints.MS.new$MS.Time.Limit
ms.time.limit<-ifelse(ms.time.limit >= 1, 1,0)

ms.reporting.requirements<-data.constraints.MS.new$MS.Reporting.Requirements
ms.reporting.requirements<-ifelse(ms.reporting.requirements >= 1, 1,0)

ms.consultation.requirements<-data.constraints.MS.new$MS.Consultation.Requirements
ms.consultation.requirements<-ifelse(ms.consultation.requirements >= 1, 1,0)

ms.appeals.procedures<-data.constraints.MS.new$MS.Appeals.Procedures
ms.appeals.procedures<-ifelse(ms.appeals.procedures >= 1, 1,0)

ms.executive.action.required<-data.constraints.MS.new$MS.Executive.Action.Required
ms.executive.action.required<-ifelse(ms.executive.action.required >= 1, 1,0) # Unusable

ms.legislative.action.required<-data.constraints.MS.new$MS.Legislative.Action.Required
ms.legislative.action.required<-ifelse(ms.legislative.action.required >= 1, 1,0) # Unusable

ms.spending.limits<-data.constraints.MS.new$MS.Spending.Limits
ms.spending.limits<-ifelse(ms.spending.limits >= 1, 1,0)

ms.executive.action.possible<-data.constraints.MS.new$MS.Executive.Action.Possible
ms.executive.action.possible<-ifelse(ms.executive.action.possible >= 1, 1,0) # Unusable

ms.exemptions<-data.constraints.MS.new$MS.Exemptions
ms.exemptions<-ifelse(ms.exemptions >= 1, 1,0) # Unusable

ms.public.hearings<-data.constraints.MS.new$MS.Public.Hearings
ms.public.hearings<-ifelse(ms.public.hearings >= 1, 1,0) # Unusable

ms.legislative.action.possible<-data.constraints.MS.new$MS.Legislative.Action.Possible
ms.legislative.action.possible<-ifelse(ms.legislative.action.possible >= 1, 1,0) #Unusable

# Let's put all of the constraints into a final matrix
constraint.label.mat = data.frame(
  ms.rulemaking.requirements,
  ms.time.limit,
  ms.reporting.requirements,
  ms.consultation.requirements,
  ms.appeals.procedures,
  ms.executive.action.required,
  ms.spending.limits,
  ms.executive.action.possible
)

# Now we have to train 8 classifiers
classifiers = c()
constraint.performance.table = data.frame(c(0),c(0), c(0), c(0), c(0),c(0))
names(constraint.performance.table) = c("Constraint Type", "Accuracy", "Sensitivity", "Specificity", "F1","Precision")
classifiernames = c("xgb1","xgb2","xgb3","xgb4","xgb5","xgb6")

# Start loop here
for(i in 1:dim(constraint.label.mat)[2]){ 
  set.seed(42616)
  dtm_mat<-as.matrix(dtm.new)
  mllabel = data.frame(constraint.label.mat[,i],dtm_mat)
  train=sample(1:dim(mllabel)[1],
               dim(mllabel)[1]*0.70)
  dtm_mat<-as.matrix(dtm.new)
  trainX = dtm_mat[train,]
  testX = dtm_mat[-train,]
  trainY = constraint.label.mat[,i][train]
  testY = constraint.label.mat[,i][-train]

  traindata<-data.frame(trainY,trainX)
  testdata<-data.frame(testY,testX)

  traindata.b <- xgb.DMatrix(data = trainX,label = trainY) 
  testdata.b  <- xgb.DMatrix(data = testX,label=testY)

  pospredweight = as.vector(table(trainY)[1])/as.vector(table(trainY)[2])

  set.seed(100)

  # Parameter tuning
  # these are default parameters
  params <- list(booster = "gbtree", objective = "binary:logistic", 
               eta=0.3, gamma=0, max_depth=6, min_child_weight=1, 
               subsample=1, colsample_bytree=1)

  xgbcv<- xgb.cv( params = params, data = traindata.b,
                 nrounds = 200, nfold = 5, showsd = T, 
                 stratified = T, early.stopping.rounds = 20, print.every_n = 10,
                 maximize = F,
                 scale_pos_weight = pospredweight)
  
  ## Plot train and test error
  test.error = xgbcv$evaluation_log[,4]
  train.error = xgbcv$evaluation_log[,2]
  
  # Which number of iterations has the lowest training error?
  best.iter = which(xgbcv$evaluation_log$test_error_mean ==  min(xgbcv$evaluation_log$test_error_mean))
  best.iter = best.iter[1]

  #first default - model training
      xgb1 <- xgb.train(params = params, data = traindata.b, 
                  nrounds = best.iter, 
                  watchlist = list(val=testdata.b,train=traindata.b),
                  print.every_n = 10, early_stopping_rounds = 10, 
                  maximize = F , eval_metric = "error",
                  scale_pos_weight = pospredweight,
                  alpha=1)
  classifiers = c(assign(classifiernames[i],xgb1),classifiers)
  
  #model prediction
  xgbpred <- predict(xgb1,testdata.b)
  xgbpred <- ifelse(xgbpred > 0.5,1,0)

  cmat = confusionMatrix(factor(xgbpred), factor(testY),positive="1")
  
  F1 = round(as.numeric(cmat$byClass[7]),3)
  accuracy = round(as.numeric(cmat$overall[1]),3)
  sensitivity = round(as.numeric(cmat$byClass[1]),3)
  specificity = round(as.numeric(cmat$byClass[2]),3)
  constrainttype = toString(names(constraint.label.mat[i]))
  precision = round(as.numeric(cmat$byClass[3]),3)
  
  outvec = c(constrainttype, accuracy, sensitivity, specificity,F1,precision)
  constraint.performance.table = rbind(outvec, constraint.performance.table)
  
  # Produce a word importance plot for each category
  setwd("~/Dropbox/Research/Papers/Delegation-ML-Project/Draft-EU-Paper/figs/APSR_RnR")
  
  mat <- xgb.importance(feature_names = colnames(trainX),model = xgb1)
  
  png(paste("term-importance-constraint-ms",i,".png",sep=""))
  xgb.plot.importance(importance_matrix = mat[1:10],
                      xlab = "Information Gain",
                      ylab = "Term")
  dev.off()
  
  # Plot training and test error
  df = data.frame(
    Iteration = c(1:length(test.error$test_error_mean), 1:length(test.error$test_error_mean)),
    Error = c(test.error$test_error_mean,train.error$train_error_mean), 
    Type = c(rep("Test Error",length(test.error$test_error_mean)), 
             rep("Train Error",length(test.error$test_error_mean)))
  )
  
  ggplot(data=df, aes(x=Iteration, y=Error, colour=Type)) +
    geom_line()+
    geom_point() + theme_classic() +
    geom_vline(xintercept=best.iter,linetype="dotted") 
  ggsave(paste("traintest",i,".png",sep=""))
  
  # Store each trained classifier in the "Trained-Classifiers" directory
  classifiername = paste(constrainttype,"_classifier_ms",".RData",sep="")
  directory = "~/Dropbox/Research/Papers/Delegation-ML-Project/Trained-Classifiers/APSR_RnR/"
  save.image(
    paste(directory,classifiername,sep="")
  )
  
}

######## End Loop here######## End Loop here######## End Loop here######## End Loop here######## End Loop here
######## End Loop here######## End Loop here######## End Loop here######## End Loop here######## End Loop here
######## End Loop here######## End Loop here######## End Loop here######## End Loop here######## End Loop here
######## End Loop here######## End Loop here######## End Loop here######## End Loop here######## End Loop here


############################################################################################################
############################################################################################################
############################################################################################################
############################################################################################################
##################################    Classifier Training Performance Delegation ###########################
############################################################################################################
############################################################################################################
############################################################################################################
############################################################################################################
############################################################################################################
############################################################################################################
############################################################################################################
#### Need to incorporate codings here

delegation.MS = data.delegation.MS.new$delegation.member

dtm_mat<-as.matrix(dtm.new)
mllabel = data.frame(delegation.MS,dtm_mat)
train=sample(1:dim(mllabel)[1],
             dim(mllabel)[1]*0.95)
dtm_mat<-as.matrix(dtm.new)
trainX = dtm_mat[train,]
testX = dtm_mat[-train,]
trainY = delegation.MS[train]
testY = delegation.MS[-train]

traindata<-data.frame(trainY,trainX)
testdata<-data.frame(testY,testX)

traindata.b <- xgb.DMatrix(data = trainX,label = trainY) 
testdata.b  <- xgb.DMatrix(data = testX,label=testY)

pospredweight = as.vector(table(trainY)[1])/as.vector(table(trainY)[2])

set.seed(100)

# Parameter tuning
# these are default parameters
params <- list(booster = "gbtree", objective = "binary:logistic", 
               eta=0.3, gamma=0, max_depth=6, min_child_weight=1, 
               subsample=1, colsample_bytree=1)

xgbcv<- xgb.cv( params = params, data = traindata.b,
                nrounds = 500, nfold = 5, showsd = T, 
                stratified = T, early.stopping.rounds = 20, print.every_n = 10,
                maximize = F,
                scale_pos_weight = pospredweight)


## Plot train and test error
test.error = xgbcv$evaluation_log[,4]
train.error = xgbcv$evaluation_log[,2]

# Which number of iterations has the lowest training error?
best.iter = which(xgbcv$evaluation_log$test_error_mean ==  min(xgbcv$evaluation_log$test_error_mean))
best.iter = best.iter[1]

#first default - model training
xgb1 <- xgb.train(params = params, data = traindata.b, 
                  nrounds = best.iter, 
                  watchlist = list(val=testdata.b,train=traindata.b),
                  print.every_n = 10, early_stopping_rounds = 10, 
                  maximize = F , eval_metric = "error",
                  scale_pos_weight = pospredweight,
                  alpha=1)

#library(DiagrammeR)
#gr <- xgb.plot.tree(model=xgb1, trees=0:1, render=FALSE)

#model prediction
xgbpred <- predict(xgb1,testdata.b)
xgbpred <- ifelse(xgbpred > 0.5,1,0)

cmat = confusionMatrix(factor(xgbpred), factor(testY),positive="1")

accuracy = round(as.numeric(cmat$overall[1]),3)
sensitivity = round(as.numeric(cmat$byClass[1]),3)
specificity = round(as.numeric(cmat$byClass[2]),3)
constrainttype = toString(names(constraint.label.mat[i]))
F1 =  round(as.numeric(cmat$byClass[7]),3)
precision = round(as.numeric(cmat$byClass[3]),3)

outvec = c("delegation.ms", accuracy, sensitivity, specificity,F1,precision)
constraint.performance.table = rbind(outvec, constraint.performance.table)

# Produce a word importance plot for each category
setwd("/Users/jason/Dropbox/Research/Papers/Delegation-ML-Project/Draft-EU-Paper/figs/APSR_RnR")

mat <- xgb.importance(feature_names = colnames(trainX),model = xgb1)

png(paste("term-importance","-delegation-ms.png",sep=""))
xgb.plot.importance(importance_matrix = mat[1:10],
                    xlab = "Information Gain",
                    ylab = "Term")
dev.off()

# Plot training and test error
df = data.frame(
  Iteration = c(1:length(test.error$test_error_mean), 1:length(test.error$test_error_mean)),
  Error = c(test.error$test_error_mean,train.error$train_error_mean), 
  Type = c(rep("Test Error",length(test.error$test_error_mean)), 
           rep("Train Error",length(test.error$test_error_mean)))
)

ggplot(data=df, aes(x=Iteration, y=Error, colour=Type)) +
  geom_line()+
  geom_point() + theme_classic() +
  geom_vline(xintercept=best.iter,linetype="dotted") 
ggsave(paste("traintest","delegation.png",sep=""))

# Store each trained classifier in the "Trained-Classifiers" directory
  directory = "~/Dropbox/Research/Papers/Delegation-ML-Project/Trained-Classifiers/APSR_RnR/"
  save.image(
    paste(directory,"delegation-ms.RData",sep="")
  )

  ######## End Loop here######## End Loop here######## End Loop here######## End Loop here######## End Loop here
  ######## End Loop here######## End Loop here######## End Loop here######## End Loop here######## End Loop here
  ######## End Loop here######## End Loop here######## End Loop here######## End Loop here######## End Loop here
  ######## End Loop here######## End Loop here######## End Loop here######## End Loop here######## End Loop here
  
  write.csv(constraint.performance.table,
            "/Users/jason/Dropbox/Research/Papers/Delegation-ML-Project/FINAL-OUTPUT-DATA/APSR_RnR/Final-Tables/MS-Performance-Final-Plus.csv")
  
